library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(leaps)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 3.0-2
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
library(class)
First we import the data and examine the different parameters of this dataset, so that we have a clear idea of which parameters are quantitative and which are qualitative.
#Import data
# Read the Kaggle "House Prices" training and test CSVs.  summary() below
# reports each column's type (numeric vs. factor) and its NA count, which
# motivates the imputation choices made later in the script.
train <- read.csv('train.csv', header = TRUE)
test <- read.csv('test.csv', header = TRUE)
summary(train)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 C (all): 10 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 FV : 65 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 RH : 16 Median : 69.00
## Mean : 730.5 Mean : 56.9 RL :1151 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 RM : 218 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape LandContour Utilities
## Min. : 1300 Grvl: 6 Grvl: 50 IR1:484 Bnk: 63 AllPub:1459
## 1st Qu.: 7554 Pave:1454 Pave: 41 IR2: 41 HLS: 50 NoSeWa: 1
## Median : 9478 NA's:1369 IR3: 10 Low: 36
## Mean : 10517 Reg:925 Lvl:1311
## 3rd Qu.: 11602
## Max. :215245
##
## LotConfig LandSlope Neighborhood Condition1 Condition2
## Corner : 263 Gtl:1382 NAmes :225 Norm :1260 Norm :1445
## CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81 Feedr : 6
## FR2 : 47 Sev: 13 OldTown:113 Artery : 48 Artery : 2
## FR3 : 4 Edwards:100 RRAn : 26 PosN : 2
## Inside :1052 Somerst: 86 PosN : 19 RRNn : 2
## Gilbert: 79 RRAe : 11 PosA : 1
## (Other):707 (Other): 15 (Other): 2
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1Fam :1220 1Story :726 Min. : 1.000 Min. :1.000 Min. :1872
## 2fmCon: 31 2Story :445 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Duplex: 52 1.5Fin :154 Median : 6.000 Median :5.000 Median :1973
## Twnhs : 43 SLvl : 65 Mean : 6.099 Mean :5.575 Mean :1971
## TwnhsE: 114 SFoyer : 37 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## 1.5Unf : 14 Max. :10.000 Max. :9.000 Max. :2010
## (Other): 19
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## Min. :1950 Flat : 13 CompShg:1434 VinylSd:515 VinylSd:504
## 1st Qu.:1967 Gable :1141 Tar&Grv: 11 HdBoard:222 MetalSd:214
## Median :1994 Gambrel: 11 WdShngl: 6 MetalSd:220 HdBoard:207
## Mean :1985 Hip : 286 WdShake: 5 Wd Sdng:206 Wd Sdng:197
## 3rd Qu.:2004 Mansard: 7 ClyTile: 1 Plywood:108 Plywood:142
## Max. :2010 Shed : 2 Membran: 1 CemntBd: 61 CmentBd: 60
## (Other): 2 (Other):128 (Other):136
## MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual
## BrkCmn : 15 Min. : 0.0 Ex: 52 Ex: 3 BrkTil:146 Ex :121
## BrkFace:445 1st Qu.: 0.0 Fa: 14 Fa: 28 CBlock:634 Fa : 35
## None :864 Median : 0.0 Gd:488 Gd: 146 PConc :647 Gd :618
## Stone :128 Mean : 103.7 TA:906 Po: 1 Slab : 24 TA :649
## NA's : 8 3rd Qu.: 166.0 TA:1282 Stone : 6 NA's: 37
## Max. :1600.0 Wood : 3
## NA's :8
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Fa : 45 Av :221 ALQ :220 Min. : 0.0 ALQ : 19
## Gd : 65 Gd :134 BLQ :148 1st Qu.: 0.0 BLQ : 33
## Po : 2 Mn :114 GLQ :418 Median : 383.5 GLQ : 14
## TA :1311 No :953 LwQ : 74 Mean : 443.6 LwQ : 46
## NA's: 37 NA's: 38 Rec :133 3rd Qu.: 712.2 Rec : 54
## Unf :430 Max. :5644.0 Unf :1256
## NA's: 37 NA's: 38
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Floor: 1 Ex:741
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 GasA :1428 Fa: 49
## Median : 0.00 Median : 477.5 Median : 991.5 GasW : 18 Gd:241
## Mean : 46.55 Mean : 567.2 Mean :1057.4 Grav : 7 Po: 1
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2 OthW : 2 TA:428
## Max. :1474.00 Max. :2336.0 Max. :6110.0 Wall : 4
##
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## N: 95 FuseA: 94 Min. : 334 Min. : 0 Min. : 0.000
## Y:1365 FuseF: 27 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
## FuseP: 3 Median :1087 Median : 0 Median : 0.000
## Mix : 1 Mean :1163 Mean : 347 Mean : 5.845
## SBrkr:1334 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
## NA's : 1 Max. :4692 Max. :2065 Max. :572.000
##
## GrLivArea BsmtFullBath BsmtHalfBath FullBath
## Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
## Median :1464 Median :0.0000 Median :0.00000 Median :2.000
## Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
## 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
## Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
##
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## Min. :0.0000 Min. :0.000 Min. :0.000 Ex:100 Min. : 2.000
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 Fa: 39 1st Qu.: 5.000
## Median :0.0000 Median :3.000 Median :1.000 Gd:586 Median : 6.000
## Mean :0.3829 Mean :2.866 Mean :1.047 TA:735 Mean : 6.518
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :2.0000 Max. :8.000 Max. :3.000 Max. :14.000
##
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## Maj1: 14 Min. :0.000 Ex : 24 2Types : 6 Min. :1900
## Maj2: 5 1st Qu.:0.000 Fa : 33 Attchd :870 1st Qu.:1961
## Min1: 31 Median :1.000 Gd :380 Basment: 19 Median :1980
## Min2: 34 Mean :0.613 Po : 20 BuiltIn: 88 Mean :1979
## Mod : 15 3rd Qu.:1.000 TA :313 CarPort: 9 3rd Qu.:2002
## Sev : 1 Max. :3.000 NA's:690 Detchd :387 Max. :2010
## Typ :1360 NA's : 81 NA's :81
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## Fin :352 Min. :0.000 Min. : 0.0 Ex : 3 Ex : 2
## RFn :422 1st Qu.:1.000 1st Qu.: 334.5 Fa : 48 Fa : 35
## Unf :605 Median :2.000 Median : 480.0 Gd : 14 Gd : 9
## NA's: 81 Mean :1.767 Mean : 473.0 Po : 3 Po : 7
## 3rd Qu.:2.000 3rd Qu.: 576.0 TA :1311 TA :1326
## Max. :4.000 Max. :1418.0 NA's: 81 NA's: 81
##
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## N: 90 Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## P: 30 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Y:1340 Median : 0.00 Median : 25.00 Median : 0.00 Median : 0.00
## Mean : 94.24 Mean : 46.66 Mean : 21.95 Mean : 3.41
## 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :857.00 Max. :547.00 Max. :552.00 Max. :508.00
##
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## Min. : 0.00 Min. : 0.000 Ex : 2 GdPrv: 59 Gar2: 2
## 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2 GdWo : 54 Othr: 2
## Median : 0.00 Median : 0.000 Gd : 3 MnPrv: 157 Shed: 49
## Mean : 15.06 Mean : 2.759 NA's:1453 MnWw : 11 TenC: 1
## 3rd Qu.: 0.00 3rd Qu.: 0.000 NA's :1179 NA's:1406
## Max. :480.00 Max. :738.000
##
## MiscVal MoSold YrSold SaleType
## Min. : 0.00 Min. : 1.000 Min. :2006 WD :1267
## 1st Qu.: 0.00 1st Qu.: 5.000 1st Qu.:2007 New : 122
## Median : 0.00 Median : 6.000 Median :2008 COD : 43
## Mean : 43.49 Mean : 6.322 Mean :2008 ConLD : 9
## 3rd Qu.: 0.00 3rd Qu.: 8.000 3rd Qu.:2009 ConLI : 5
## Max. :15500.00 Max. :12.000 Max. :2010 ConLw : 5
## (Other): 9
## SaleCondition SalePrice
## Abnorml: 101 Min. : 34900
## AdjLand: 4 1st Qu.:129975
## Alloca : 12 Median :163000
## Family : 20 Mean :180921
## Normal :1198 3rd Qu.:214000
## Partial: 125 Max. :755000
##
library(ggplot2)
# Histograms of SalePrice on the raw and log(1 + x) scales: the raw
# distribution is right-skewed, the log transform looks roughly normal.
# NOTE(review): par(mfrow = c(1, 2)) only affects base graphics, not ggplot
# objects -- each plot is still drawn on its own page; use gridExtra or
# patchwork to actually place them side by side.
par(mfrow=c(1,2))
ggplot(data = train, aes(x = SalePrice)) + geom_histogram(bins = 50)
ggplot(data = train, aes(x = log(SalePrice+1))) + geom_histogram(bins = 50)
In the next few steps, we discuss each parameter in the dataset and preprocess the whole dataset, so that we have a clean dataset to feed into our modeling later. It is also important to note that we apply a log transformation to SalePrice (the response y) in this problem, since the original distribution is right-skewed.
# work with log(1 + SalePrice) to correct the right skew seen above
train$SalePrice <- log(train$SalePrice+1)
# save the id columns that are needed to compose the submission file
train.id <- train$Id
test.id <- test$Id
# add SalePrice column to test set so both frames have identical columns
test$SalePrice <- NA
# combine the training and test sets so all cleaning is applied uniformly
combined <- rbind(train, test)
# drop id column which is unnecessary for the prediction
combined <- combined[, -1]
# fix an obvious data-entry typo: a garage cannot be built in year 2207
combined$GarageYrBlt[which(combined$GarageYrBlt == 2207)] <- 2007
library(e1071) # for skewness()
# convert MSSubclass that represents categorical variable to factor
combined$MSSubClass <- as.factor(combined$MSSubClass)
# convert MoSold and YrSold to factors (sale timing is categorical here)
combined$MoSold <- as.factor(combined$MoSold)
combined$YrSold <- as.factor(combined$YrSold)
# impute LotFrontage with mean
combined$LotFrontage[is.na(combined$LotFrontage)] <- mean(combined$LotFrontage[!is.na(combined$LotFrontage)])
# MasVnrArea: rows with no veneer type recorded get area 0
combined$MasVnrArea[is.na(combined$MasVnrType)] <- 0
# Basement: NA basement quality is taken to mean no basement, so the
# basement square footages are set to 0
combined$BsmtFinSF1[is.na(combined$BsmtFinSF1) & is.na(combined$BsmtQual)] <- 0
combined$BsmtFinSF2[is.na(combined$BsmtFinSF2) & is.na(combined$BsmtQual)] <- 0
combined$BsmtUnfSF[is.na(combined$BsmtUnfSF) & is.na(combined$BsmtQual)] <- 0
combined$TotalBsmtSF[is.na(combined$TotalBsmtSF) & is.na(combined$BsmtQual)] <- 0
# Bath: missing basement bath counts mean no basement baths
combined$BsmtFullBath[is.na(combined$BsmtFullBath)] <- 0
combined$BsmtHalfBath[is.na(combined$BsmtHalfBath)] <- 0
# Garage: no garage -> 0 cars / 0 area; a missing garage build year
# defaults to the house's own build year
combined$GarageCars[is.na(combined$GarageCars)] <- 0
combined$GarageYrBlt[is.na(combined$GarageYrBlt)] <- combined$YearBuilt[is.na(combined$GarageYrBlt)]
combined$GarageArea[is.na(combined$GarageArea)] <- 0
# fix skewness for numeric variables
var.classes <- sapply(names(combined), function(x){class(combined[[x]])})
numeric.col <- names(var.classes[var.classes != "factor"])
# drop the last numeric column -- assumes SalePrice is the final column of
# `combined` so the (already log-transformed) response is excluded from the
# transform below; TODO confirm this positional assumption
numeric.col <- numeric.col[-length(numeric.col)]
# apply log(1 + x) to every numeric predictor whose skewness exceeds 0.75
skew <- sapply(numeric.col, function(x){skewness(combined[[x]], na.rm = TRUE)})
skew <- skew[skew > 0.75]
for(x in names(skew)) {
combined[[x]] <- log(combined[[x]] + 1)
}
# Impute NA in the categorical columns and re-factor each one so the fill
# value becomes a regular level.  Every column follows the same pipeline
# the original script spelled out one column at a time: coerce to
# character, replace NA with the fill value, convert back to factor
# (levels end up alphabetically sorted either way, so the result is
# identical).  For most columns NA means the feature is absent ("None");
# the remaining fills are the constants chosen by the original analysis
# (mostly training-set modes, e.g. SBrkr for Electrical, Typ for
# Functional, RL for MSZoning).
na.fill <- c(
  Alley        = "None",
  BsmtQual     = "None",
  BsmtCond     = "None",
  BsmtExposure = "None",
  BsmtFinType1 = "None",
  BsmtFinType2 = "None",
  Electrical   = "SBrkr",
  KitchenQual  = "TA",
  Functional   = "Typ",
  FireplaceQu  = "None",
  GarageFinish = "None",
  GarageQual   = "None",
  GarageCond   = "None",
  PoolQC       = "None",
  Fence        = "None",
  MSZoning     = "RL",
  Exterior1st  = "VinylSd",
  Exterior2nd  = "HdBoard",
  MasVnrType   = "None",
  GarageType   = "None",
  MiscFeature  = "None",
  SaleType     = "Oth"
)
for (col in names(na.fill)) {
  vals <- as.character(combined[[col]])
  vals[is.na(vals)] <- na.fill[[col]]
  combined[[col]] <- as.factor(vals)
}
# drop Utilities that is not significant for prediction (it is nearly
# constant: 1459 of 1460 training rows are AllPub)
combined$Utilities <- NULL
# split the cleaned frame back into training rows (SalePrice known) and
# test rows (SalePrice was filled with NA before rbind)
train <- combined[!is.na(combined$SalePrice),]
test <- combined[is.na(combined$SalePrice),]
From the above analysis, we identified the best parameters for modeling this dataset, and we use them in each of the models below.
# Estimate the test MSE of the hand-picked linear model by 10-fold
# cross-validation.  The seed and the single sample() call are kept
# exactly as before so the fold assignment is unchanged.
set.seed(421)
fold.index <- cut(sample(1:nrow(train)), breaks=10, labels=FALSE)
sp.formula <- SalePrice ~ MSZoning + LotArea + OverallQual + OverallCond +
  YearBuilt + YearRemodAdd + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF +
  X1stFlrSF + X2ndFlrSF + LowQualFinSF + BsmtFullBath + BsmtHalfBath +
  FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd +
  Fireplaces + GarageCars + GarageArea + WoodDeckSF + ScreenPorch
# one held-out MSE per fold: fit on the other nine folds, score on fold k
pred.error <- sapply(1:10, function(k) {
  holdout <- which(fold.index == k)
  fold.fit <- lm(sp.formula, data = train[-holdout, ])
  fold.pred <- predict(fold.fit, train[holdout, ])
  mean((fold.pred - train$SalePrice[holdout])^2)
})
# cv estimate: average squared error across the ten held-out folds
mse.lm <- mean(pred.error)
mse.lm
## [1] 0.01959431
# Refit the chosen linear model on the full training set and predict the
# Kaggle test set.
sp.lm.mod <- lm(SalePrice~MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+
YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+BsmtFullBath
+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+
GarageCars+GarageArea+WoodDeckSF+ScreenPorch,train)
pred.lm <- predict(sp.lm.mod, test)
# some test rows produce NA predictions (presumably factor levels or
# values unseen in training -- TODO confirm); fall back to the mean of the
# valid predictions for those rows
pred.lm[which(is.na(pred.lm))] <- mean(pred.lm, na.rm = TRUE)
# undo the log(1 + y) transform before writing the submission
test.lm <- data.frame(Id = test.id, SalePrice = exp(pred.lm)-1)
head(test.lm,5)
## Id SalePrice
## 1461 1461 129581.6
## 1462 1462 156766.0
## 1463 1463 181371.1
## 1464 1464 205131.7
## 1465 1465 198316.5
write.csv(test.lm,file='Linear Regression Model Housing Price.csv',row.names = FALSE)
# Predict method for leaps::regsubsets objects (the package ships no
# predict()).
#   object  - a fitted regsubsets object
#   newdata - data frame to predict on
#   id      - model size (number of terms) whose coefficients to use
# Returns a one-column matrix of predictions.
predict.regsubsets = function(object, newdata, id){
# recover the model formula from the stored call; this requires the
# formula to have been written literally in the regsubsets() call
form = as.formula(object$call[[2]])
mat = model.matrix(form, newdata)
coefi = coef(object, id=id)
xvars = names(coefi)
# keep only the columns belonging to the size-`id` model
mat[,xvars]%*%coefi
}
library(glmnet)
# Best-subset selection over the chosen predictors (27 columns after the
# MSZoning factor is expanded to dummies).
fit.best <- regsubsets(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+
YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+BsmtFullBath
+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+
GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
data = train, nvmax = 27)
best.summary <- summary(fit.best)
# Optimal model size under each criterion.  Computed once and reused below
# so the plot markers always agree with the reported optima (the original
# hard-coded 24/21/25, and the adjusted-R^2 marker at 25 disagreed with
# which.max(best.summary$adjr2), which printed 24).
best.cp <- which.min(best.summary$cp)
best.cp
## [1] 24
best.bic <- which.min(best.summary$bic)
best.bic
## [1] 21
best.adjr2 <- which.max(best.summary$adjr2)
best.adjr2
## [1] 24
par(mfrow = c(2, 2))
plot(best.summary$cp, xlab = "Subset Size", ylab = "Cp", pch = 20, type = "l")
points(best.cp, best.summary$cp[best.cp], pch = 4, col = "red", lwd = 7)
plot(best.summary$bic, xlab = "Subset Size", ylab = "BIC", pch = 20, type = "l")
points(best.bic, best.summary$bic[best.bic], pch = 4, col = "red", lwd = 7)
plot(best.summary$adjr2, xlab = "Subset Size", ylab = "Adjusted R2", pch = 20, type = "l")
points(best.adjr2, best.summary$adjr2[best.adjr2], pch = 4, col = "red", lwd = 7)
# coefficients of the adjusted-R^2-optimal model
coef(fit.best, best.adjr2)
## (Intercept) MSZoningFV MSZoningRH MSZoningRL MSZoningRM LotArea
## 0.518572979 0.389170744 0.359652251 0.324018606 0.286519770 0.085466357
## OverallQual OverallCond YearBuilt YearRemodAdd BsmtFinSF1 BsmtUnfSF
## 0.086852678 0.043388514 0.002278495 0.001186687 0.010678988 0.004855216
## X1stFlrSF X2ndFlrSF LowQualFinSF BsmtFullBath FullBath HalfBath
## 0.335034068 0.019963961 0.009472446 0.039000037 0.041368414 0.031215373
## BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageCars WoodDeckSF
## -0.015965582 -0.211327411 0.128039872 0.030903591 0.070370134 0.004130451
## ScreenPorch
## 0.009728612
According to the above summary, best subset selection chooses 24 variables under Cp, 21 variables under BIC, and 24 variables under adjusted R^2 (matching the printed which.max result).
# 10-fold cross-validation over all model sizes (1..27) to choose the best
# size for best-subset selection.
k = 10
set.seed(123)
folds = sample(1:k, nrow(train), replace = TRUE)
# one row per fold, one column per candidate model size
cv.errors = matrix(NA, k, 27, dimnames = list(NULL, paste(1:27)))
for(j in 1:k){
fit.best = regsubsets(SalePrice ~
MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+GarageCars+GarageArea+WoodDeckSF+ScreenPorch, data = train[folds != j,],
nvmax=27)
for (i in 1:27){
pred = predict.regsubsets(fit.best, train[folds == j, ], id = i)
cv.errors[j, i] = mean((train$SalePrice[folds == j] - pred)^2)
}
}
# average fold errors column-wise: one CV estimate per model size
mean.cv.errors = apply(cv.errors, 2, mean)
mean.cv.errors
## 1 2 3 4 5 6 7
## 0.05334174 0.04197162 0.03821557 0.03119303 0.02739346 0.02469638 0.02423254
## 8 9 10 11 12 13 14
## 0.02185710 0.02229178 0.02142727 0.02157552 0.02109429 0.02156106 0.02132758
## 15 16 17 18 19 20 21
## 0.02122264 0.02066810 0.02037142 0.02071274 0.02073043 0.02028587 0.02003563
## 22 23 24 25 26 27
## 0.01982793 0.01972963 0.01957019 0.01956300 0.01955967 0.01957147
mse.best <- min(mean.cv.errors)
# CV prefers the 26-term model
names(which.min(mean.cv.errors))
## [1] "26"
# Refit best-subset selection on the full training data and build the
# submission using the CV-chosen size (26 coefficients).
fit.best = regsubsets(SalePrice ~
MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+ GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
train, nvmax = 26)
# test-set design matrix (one-sided formula: test has no SalePrice)
test.mat = model.matrix( ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+
BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+
BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+
GarageCars+GarageArea+WoodDeckSF+ScreenPorch, test)
coefi = coef(fit.best, id = 26)
pred.best = test.mat[,names(coefi)]%*%coefi
# back-transform from log(1 + y) to dollars
pred.best = data.frame(exp(pred.best)-1)
test.best <- data.frame(Id = test.id, SalePrice = pred.best[, 1])
head(test.best, 5)
## Id SalePrice
## 1 1461 129653.2
## 2 1462 156881.7
## 3 1463 181505.8
## 4 1464 205273.5
## 5 1465 198391.2
write.csv(test.best,file='Best Subset selection Housing Price.csv',row.names = FALSE)
# Forward stepwise selection over the same predictor set.
fit.fwd <- regsubsets(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+
GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
data = train, nvmax = 27, method='forward')
fwd.summary <- summary(fit.fwd)
# Optimal model size per criterion, computed (not hard-coded) so the plot
# markers below always match; the original marked the adjusted-R^2 optimum
# at 25 although which.max(fwd.summary$adjr2) printed 24.
fwd.cp <- which.min(fwd.summary$cp)
fwd.cp
## [1] 24
fwd.bic <- which.min(fwd.summary$bic)
fwd.bic
## [1] 21
fwd.adjr2 <- which.max(fwd.summary$adjr2)
fwd.adjr2
## [1] 24
par(mfrow = c(2, 2))
plot(fwd.summary$cp, xlab = "Subset Size", ylab = "Cp", pch = 20, type = "l")
points(fwd.cp, fwd.summary$cp[fwd.cp], pch = 4, col = "red", lwd = 7)
plot(fwd.summary$bic, xlab = "Subset Size", ylab = "BIC", pch = 20, type = "l")
points(fwd.bic, fwd.summary$bic[fwd.bic], pch = 4, col = "red", lwd = 7)
plot(fwd.summary$adjr2, xlab = "Subset Size", ylab = "Adjusted R2", pch = 20, type = "l")
points(fwd.adjr2, fwd.summary$adjr2[fwd.adjr2], pch = 4, col = "red", lwd = 7)
# coefficients of the adjusted-R^2-optimal model
coef(fit.fwd, fwd.adjr2)
## (Intercept) MSZoningFV MSZoningRH MSZoningRL MSZoningRM LotArea
## 0.518572979 0.389170744 0.359652251 0.324018606 0.286519770 0.085466357
## OverallQual OverallCond YearBuilt YearRemodAdd BsmtFinSF1 BsmtUnfSF
## 0.086852678 0.043388514 0.002278495 0.001186687 0.010678988 0.004855216
## X1stFlrSF X2ndFlrSF LowQualFinSF BsmtFullBath FullBath HalfBath
## 0.335034068 0.019963961 0.009472446 0.039000037 0.041368414 0.031215373
## BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageCars WoodDeckSF
## -0.015965582 -0.211327411 0.128039872 0.030903591 0.070370134 0.004130451
## ScreenPorch
## 0.009728612
# 10-fold cross-validation over model sizes for forward stepwise selection
# (same seed/folds as the best-subset CV above, so results are comparable).
k = 10
set.seed(123)
folds = sample(1:k, nrow(train), replace = TRUE)
cv.errors = matrix(NA, k, 27, dimnames = list(NULL, paste(1:27)))
for(j in 1:k){
fit.fwd = regsubsets(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
data = train[folds != j,], nvmax = 27, method = "forward")
for (i in 1:27){
pred = predict.regsubsets(fit.fwd, train[folds == j, ], id = i)
cv.errors[j, i] = mean((train$SalePrice[folds == j] - pred)^2)
}
}
# one CV error estimate per candidate model size
mean.cv.errors = apply(cv.errors, 2, mean)
mean.cv.errors
## 1 2 3 4 5 6 7
## 0.05334174 0.04197162 0.03821557 0.03353788 0.03138078 0.02766656 0.02608801
## 8 9 10 11 12 13 14
## 0.02350706 0.02278304 0.02117378 0.02102577 0.02106928 0.02077449 0.02065817
## 15 16 17 18 19 20 21
## 0.02052812 0.02039953 0.02028935 0.02028194 0.02029586 0.02026476 0.02038669
## 22 23 24 25 26 27
## 0.02013680 0.01974959 0.01957019 0.01956300 0.01955967 0.01957147
mse.fwd <- min(mean.cv.errors)
# CV again prefers the 26-term model
names(which.min(mean.cv.errors))
## [1] "26"
# Refit forward selection on the full training data and build the
# submission using the CV-chosen size (26 coefficients).
fit.fwd = regsubsets(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+
GarageCars+GarageArea+WoodDeckSF+ScreenPorch, train, nvmax = 26, method = "forward")
# test-set design matrix (one-sided formula: test has no SalePrice)
test.mat = model.matrix( ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+
GarageCars+GarageArea+WoodDeckSF+ScreenPorch, test)
coefi = coef(fit.fwd, id = 26)
pred.fwd = test.mat[,names(coefi)]%*%coefi
# back-transform from log(1 + y) to dollars
pred.fwd = data.frame(exp(pred.fwd)-1)
test.fwd <- data.frame(Id = test.id, SalePrice = pred.fwd[, 1])
head(test.fwd, 5)
## Id SalePrice
## 1 1461 129653.2
## 2 1462 156881.7
## 3 1463 181505.8
## 4 1464 205273.5
## 5 1465 198391.2
write.csv(test.fwd,file='Forward Selection Housing Price.csv',row.names = FALSE)
# Backward stepwise selection over the same predictor set.
fit.bwd <- regsubsets(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+
GarageCars+GarageArea+WoodDeckSF+ScreenPorch, data = train, nvmax = 27, method = 'backward')
bwd.summary <- summary(fit.bwd)
# Optimal model size per criterion, computed (not hard-coded) so the plot
# markers below always match; the original marked the adjusted-R^2 optimum
# at 25 although which.max(bwd.summary$adjr2) printed 24.
bwd.cp <- which.min(bwd.summary$cp)
bwd.cp
## [1] 24
bwd.bic <- which.min(bwd.summary$bic)
bwd.bic
## [1] 21
bwd.adjr2 <- which.max(bwd.summary$adjr2)
bwd.adjr2
## [1] 24
par(mfrow = c(2, 2))
plot(bwd.summary$cp, xlab = "Subset Size", ylab = "Cp", pch = 20, type = "l")
points(bwd.cp, bwd.summary$cp[bwd.cp], pch = 4, col = "red", lwd = 7)
plot(bwd.summary$bic, xlab = "Subset Size", ylab = "BIC", pch = 20, type = "l")
points(bwd.bic, bwd.summary$bic[bwd.bic], pch = 4, col = "red", lwd = 7)
plot(bwd.summary$adjr2, xlab = "Subset Size", ylab = "Adjusted R2", pch = 20, type = "l")
points(bwd.adjr2, bwd.summary$adjr2[bwd.adjr2], pch = 4, col = "red", lwd = 7)
# coefficients of the adjusted-R^2-optimal model
coef(fit.bwd, bwd.adjr2)
## (Intercept) MSZoningFV MSZoningRH MSZoningRL MSZoningRM LotArea
## 0.518572979 0.389170744 0.359652251 0.324018606 0.286519770 0.085466357
## OverallQual OverallCond YearBuilt YearRemodAdd BsmtFinSF1 BsmtUnfSF
## 0.086852678 0.043388514 0.002278495 0.001186687 0.010678988 0.004855216
## X1stFlrSF X2ndFlrSF LowQualFinSF BsmtFullBath FullBath HalfBath
## 0.335034068 0.019963961 0.009472446 0.039000037 0.041368414 0.031215373
## BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageCars WoodDeckSF
## -0.015965582 -0.211327411 0.128039872 0.030903591 0.070370134 0.004130451
## ScreenPorch
## 0.009728612
# 10-fold cross-validation over model sizes for backward stepwise
# selection (same seed/folds as the other CV loops).
k = 10
set.seed(123)
folds = sample(1:k, nrow(train), replace = TRUE)
cv.errors = matrix(NA, k, 27, dimnames = list(NULL, paste(1:27)))
for(j in 1:k){
# NOTE(review): named best.fit here, unlike fit.fwd/fit.bwd in the
# sibling loops -- it works, but the inconsistent name is easy to misread
best.fit = regsubsets(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+
GarageCars+GarageArea+WoodDeckSF+ScreenPorch, data = train[folds != j,], nvmax = 27, method = "backward")
for (i in 1:27){
pred = predict.regsubsets(best.fit, train[folds == j, ], id = i)
cv.errors[j, i] = mean((train$SalePrice[folds == j] - pred)^2)
}
}
# one CV error estimate per candidate model size
mean.cv.errors = apply(cv.errors, 2, mean)
mean.cv.errors
## 1 2 3 4 5 6 7
## 0.05334174 0.04197162 0.03700267 0.03119303 0.02739346 0.02469638 0.02423254
## 8 9 10 11 12 13 14
## 0.02228972 0.02222079 0.02190389 0.02175932 0.02209848 0.02208304 0.02113129
## 15 16 17 18 19 20 21
## 0.02118976 0.02112810 0.02091671 0.02044549 0.02034798 0.02020231 0.02003563
## 22 23 24 25 26 27
## 0.01982793 0.01972963 0.01957019 0.01956300 0.01955967 0.01957147
mse.bwd <- min(mean.cv.errors)
# CV again prefers the 26-term model
names(which.min(mean.cv.errors))
## [1] "26"
# Refit backward selection on the full training data and build the
# submission using the CV-chosen size (26 coefficients).
fit.bwd = regsubsets(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+
GarageCars+GarageArea+WoodDeckSF+ScreenPorch, train, nvmax = 26, method = 'backward')
# test-set design matrix (one-sided formula: test has no SalePrice)
test.mat = model.matrix( ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+
TotRmsAbvGrd+Fireplaces+ GarageCars+GarageArea+WoodDeckSF+ScreenPorch, test)
coefi = coef(fit.bwd, id = 26)
pred.bwd = test.mat[,names(coefi)]%*%coefi
# back-transform from log(1 + y) to dollars
pred.bwd = data.frame(exp(pred.bwd)-1)
test.bwd <- data.frame(Id = test.id, SalePrice = pred.bwd[, 1])
head(test.bwd, 5)
## Id SalePrice
## 1 1461 129653.2
## 2 1462 156881.7
## 3 1463 181505.8
## 4 1464 205273.5
## 5 1465 198391.2
write.csv(test.bwd,file='Backward Selection Housing Price.csv',row.names = FALSE)
# Ridge regression: hold out the remaining ~460 training rows as a
# validation set to estimate test MSE.
set.seed(123)
train.index = sample(nrow(train), 1000)
test.index = -train.index
ridge.train = train[train.index, ]
ridge.test = train[test.index, ]
# model.matrix expands factors to dummies; [,-1] drops the intercept column
x.train = model.matrix(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+
YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+
BsmtFullBath+BsmtHalfBath+FullBath+
HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+GarageCars+GarageArea+
WoodDeckSF+ScreenPorch, ridge.train)[,-1]
x.test = model.matrix(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+
BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+
KitchenAbvGr+TotRmsAbvGrd+Fireplaces+GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
ridge.test)[,-1]
# candidate lambda grid spanning 1e10 down to 1e-2
grid = 10^seq(10, -2, length = 100)
# alpha = 0 -> ridge penalty; cv.glmnet picks lambda by its own internal CV
fit.ridge = cv.glmnet(x.train, ridge.train$SalePrice, alpha = 0, lambda = grid, thresh = 1e-12)
lambda = fit.ridge$lambda.min
pred.ridge = predict(fit.ridge, newx = x.test, s = lambda)
# cv estimate
# NOTE(review): this is a single validation-set estimate of the MSE, not a
# k-fold CV estimate like the subset-selection sections above
mse.ridge <- mean((ridge.test$SalePrice-pred.ridge)^2)
mse.ridge
## [1] 0.01770622
# Refit ridge on the full training set and predict the Kaggle test set.
set.seed(123)
x.train <- model.matrix(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+
YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+
HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+GarageCars+GarageArea+WoodDeckSF+ScreenPorch, train)[,-1]
# one-sided formula: the test set has no SalePrice column to use
x.test <- model.matrix( ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces
+GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
test)[,-1]
grid <- 10^seq(10, -2, length = 100)
# alpha = 0 -> ridge; lambda chosen by cv.glmnet's internal cross-validation
fit.ridge <- cv.glmnet(x.train, train$SalePrice, alpha = 0, lambda = grid, thresh = 1e-12)
lambda <- fit.ridge$lambda.min
# predict on the log scale, then back-transform to dollars
pred.ridge <- data.frame(exp(predict(fit.ridge, newx = x.test, s = lambda)) - 1)
test.ridge <- data.frame(Id = test.id, SalePrice = pred.ridge[, 1])
head(test.ridge, 5)
## Id SalePrice
## 1 1461 129555.9
## 2 1462 156119.4
## 3 1463 181063.7
## 4 1464 205003.7
## 5 1465 197330.6
write.csv(test.ridge,file='Shrinkage-Ridge Housing Price.csv',row.names = FALSE)
# Lasso: same validation-set split as the ridge section (identical seed and
# sample call), so the two MSE estimates are directly comparable.
set.seed(123)
train.index = sample(nrow(train), 1000)
test.index = -train.index
lasso.train = train[train.index, ]
lasso.test = train[test.index, ]
# model.matrix expands factors to dummies; [,-1] drops the intercept column
x.train = model.matrix(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+
YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+
BsmtFullBath+BsmtHalfBath+FullBath+
HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+GarageCars+GarageArea+
WoodDeckSF+ScreenPorch, lasso.train)[,-1]
x.test = model.matrix(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+
BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+
KitchenAbvGr+TotRmsAbvGrd+Fireplaces+GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
lasso.test)[,-1]
grid = 10^seq(10, -2, length = 100)
# alpha = 1 -> lasso penalty; cv.glmnet picks lambda by its own internal CV
fit.lasso = cv.glmnet(x.train, lasso.train$SalePrice, alpha = 1, lambda = grid, thresh = 1e-12)
lambda = fit.lasso$lambda.min
pred.lasso = predict(fit.lasso, newx = x.test, s = lambda)
# cv estimate
# NOTE(review): like the ridge block, this is a single validation-set
# estimate rather than k-fold CV
mse.lasso <- mean((lasso.test$SalePrice-pred.lasso)^2)
mse.lasso
## [1] 0.01959865
# Refit the lasso on the full training set and predict the Kaggle test set.
set.seed(123)
x.train <- model.matrix(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+
YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+
HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+GarageCars+GarageArea+WoodDeckSF+ScreenPorch, train)[,-1]
# one-sided formula: the test set has no SalePrice column to use
x.test <- model.matrix( ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces
+GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
test)[,-1]
grid <- 10^seq(10, -2, length = 100)
# alpha = 1 -> lasso; lambda chosen by cv.glmnet's internal cross-validation
fit.lasso <- cv.glmnet(x.train, train$SalePrice, alpha = 1, lambda = grid, thresh = 1e-12)
lambda <- fit.lasso$lambda.min
# predict on the log scale, then back-transform to dollars
pred.lasso <- data.frame(exp(predict(fit.lasso, newx = x.test, s = lambda)) - 1)
test.lasso <- data.frame(Id = test.id, SalePrice = pred.lasso[, 1])
head(test.lasso, 5)
## Id SalePrice
## 1 1461 122247.7
## 2 1462 156569.4
## 3 1463 175096.9
## 4 1464 197648.4
## 5 1465 199438.1
write.csv(test.lasso,file='Shrinkage-lasso Housing Price.csv',row.names = FALSE)
First, we use smooth.spline to find the best degrees of freedom (df) for each variable in the GAM model.
smooth.spline(train$SalePrice,train$LotArea,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$LotArea, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$LotArea, cv = TRUE)
##
## Smoothing Parameter spar= 1.034062 lambda= 0.01039059 (11 iterations)
## Equivalent Degrees of Freedom (Df): 6.634186
## Penalized Criterion (RSS): 144.8477
## PRESS(l.o.o. CV): 0.2247289
# CV smoothing spline to suggest a df for OverallQual. NOTE(review):
# arguments look reversed -- the response SalePrice is passed as x.
smooth.spline(train$SalePrice,train$OverallQual,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$OverallQual, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$OverallQual, cv = TRUE)
##
## Smoothing Parameter spar= 1.024431 lambda= 0.008842534 (9 iterations)
## Equivalent Degrees of Freedom (Df): 6.879738
## Penalized Criterion (RSS): 405.4562
## PRESS(l.o.o. CV): 0.616644
# CV smoothing spline to suggest a df for OverallCond. NOTE(review):
# arguments look reversed -- the response SalePrice is passed as x.
smooth.spline(train$SalePrice,train$OverallCond,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$OverallCond, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$OverallCond, cv = TRUE)
##
## Smoothing Parameter spar= 0.9826638 lambda= 0.0044139 (11 iterations)
## Equivalent Degrees of Freedom (Df): 8.044889
## Penalized Criterion (RSS): 728.3422
## PRESS(l.o.o. CV): 1.158312
# CV smoothing spline to suggest a df for YearBuilt. NOTE(review):
# arguments look reversed -- the response SalePrice is passed as x.
smooth.spline(train$SalePrice,train$YearBuilt,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$YearBuilt, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$YearBuilt, cv = TRUE)
##
## Smoothing Parameter spar= 0.8557543 lambda= 0.0005344867 (11 iterations)
## Equivalent Degrees of Freedom (Df): 12.9412
## Penalized Criterion (RSS): 344826.4
## PRESS(l.o.o. CV): 556.9464
# CV smoothing spline to suggest a df for YearRemodAdd. NOTE(review):
# arguments look reversed -- the response SalePrice is passed as x.
smooth.spline(train$SalePrice,train$YearRemodAdd,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$YearRemodAdd, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$YearRemodAdd, cv = TRUE)
##
## Smoothing Parameter spar= 1.004997 lambda= 0.006399878 (12 iterations)
## Equivalent Degrees of Freedom (Df): 7.399341
## Penalized Criterion (RSS): 157106.9
## PRESS(l.o.o. CV): 278.8817
# CV smoothing spline to suggest a df for BsmtFinSF1. NOTE(review):
# arguments look reversed -- the response SalePrice is passed as x.
smooth.spline(train$SalePrice,train$BsmtFinSF1,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$BsmtFinSF1, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$BsmtFinSF1, cv = TRUE)
##
## Smoothing Parameter spar= 0.7910247 lambda= 0.0001820888 (11 iterations)
## Equivalent Degrees of Freedom (Df): 16.51093
## Penalized Criterion (RSS): 5662.935
## PRESS(l.o.o. CV): 8.324703
# CV smoothing spline to suggest a df for BsmtFinSF2. NOTE(review):
# arguments look reversed -- the response SalePrice is passed as x.
smooth.spline(train$SalePrice,train$BsmtFinSF2,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$BsmtFinSF2, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$BsmtFinSF2, cv = TRUE)
##
## Smoothing Parameter spar= 1.043565 lambda= 0.01215677 (10 iterations)
## Equivalent Degrees of Freedom (Df): 6.403602
## Penalized Criterion (RSS): 1939.963
## PRESS(l.o.o. CV): 3.382815
# CV smoothing spline to suggest a df for BsmtUnfSF. NOTE(review):
# arguments look reversed -- the response SalePrice is passed as x.
smooth.spline(train$SalePrice,train$BsmtUnfSF,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$BsmtUnfSF, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$BsmtUnfSF, cv = TRUE)
##
## Smoothing Parameter spar= 1.152389 lambda= 0.07439161 (12 iterations)
## Equivalent Degrees of Freedom (Df): 4.261831
## Penalized Criterion (RSS): 1881.646
## PRESS(l.o.o. CV): 3.290543
# CV smoothing spline to suggest a df for X1stFlrSF. NOTE(review):
# arguments look reversed -- the response SalePrice is passed as x.
smooth.spline(train$SalePrice,train$X1stFlrSF,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$X1stFlrSF, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$X1stFlrSF, cv = TRUE)
##
## Smoothing Parameter spar= 1.208518 lambda= 0.1892522 (18 iterations)
## Equivalent Degrees of Freedom (Df): 3.47602
## Penalized Criterion (RSS): 41.84421
## PRESS(l.o.o. CV): 0.06338194
# CV smoothing spline to suggest a df for X2ndFlrSF. NOTE(review):
# arguments look reversed -- the response SalePrice is passed as x.
smooth.spline(train$SalePrice,train$X2ndFlrSF,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$X2ndFlrSF, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$X2ndFlrSF, cv = TRUE)
##
## Smoothing Parameter spar= 0.9591777 lambda= 0.002986355 (11 iterations)
## Equivalent Degrees of Freedom (Df): 8.784485
## Penalized Criterion (RSS): 7003.273
## PRESS(l.o.o. CV): 10.43444
# CV smoothing spline to suggest a df for LowQualFinSF. NOTE(review):
# arguments look reversed -- the response SalePrice is passed as x.
smooth.spline(train$SalePrice,train$LowQualFinSF,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$LowQualFinSF, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$LowQualFinSF, cv = TRUE)
##
## Smoothing Parameter spar= 1.163104 lambda= 0.08890663 (12 iterations)
## Equivalent Degrees of Freedom (Df): 4.096689
## Penalized Criterion (RSS): 348.3476
## PRESS(l.o.o. CV): 0.5581096
# CV smoothing spline to suggest a df for BedroomAbvGr. NOTE(review):
# arguments look reversed -- the response SalePrice is passed as x.
smooth.spline(train$SalePrice,train$BedroomAbvGr,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$BedroomAbvGr, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$BedroomAbvGr, cv = TRUE)
##
## Smoothing Parameter spar= 1.163691 lambda= 0.08967998 (16 iterations)
## Equivalent Degrees of Freedom (Df): 4.088852
## Penalized Criterion (RSS): 435.4369
## PRESS(l.o.o. CV): 0.6303175
# CV smoothing spline to suggest a df for KitchenAbvGr. NOTE(review):
# arguments look reversed -- the response SalePrice is passed as x.
smooth.spline(train$SalePrice,train$KitchenAbvGr,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$KitchenAbvGr, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$KitchenAbvGr, cv = TRUE)
##
## Smoothing Parameter spar= 0.8159963 lambda= 0.0002758647 (14 iterations)
## Equivalent Degrees of Freedom (Df): 15.02772
## Penalized Criterion (RSS): 4.376797
## PRESS(l.o.o. CV): 0.007769115
# CV smoothing spline to suggest a df for TotRmsAbvGrd. NOTE(review):
# arguments look reversed -- the response SalePrice is passed as x.
smooth.spline(train$SalePrice,train$TotRmsAbvGrd,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$TotRmsAbvGrd, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$TotRmsAbvGrd, cv = TRUE)
##
## Smoothing Parameter spar= 1.324724 lambda= 1.307238 (18 iterations)
## Equivalent Degrees of Freedom (Df): 2.433439
## Penalized Criterion (RSS): 20.52923
## PRESS(l.o.o. CV): 0.03258072
# CV smoothing spline to suggest a df for GarageArea. NOTE(review):
# arguments look reversed -- the response SalePrice is passed as x.
smooth.spline(train$SalePrice,train$GarageArea,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$GarageArea, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$GarageArea, cv = TRUE)
##
## Smoothing Parameter spar= 1.054994 lambda= 0.01470245 (12 iterations)
## Equivalent Degrees of Freedom (Df): 6.134974
## Penalized Criterion (RSS): 16805833
## PRESS(l.o.o. CV): 26261.65
# CV smoothing spline to suggest a df for WoodDeckSF. NOTE(review):
# arguments look reversed -- the response SalePrice is passed as x.
smooth.spline(train$SalePrice,train$WoodDeckSF,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$WoodDeckSF, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$WoodDeckSF, cv = TRUE)
##
## Smoothing Parameter spar= 0.8717063 lambda= 0.0006969245 (15 iterations)
## Equivalent Degrees of Freedom (Df): 12.1895
## Penalized Criterion (RSS): 3873.201
## PRESS(l.o.o. CV): 5.933387
# CV smoothing spline to suggest a df for ScreenPorch. NOTE(review):
# arguments look reversed -- the response SalePrice is passed as x.
smooth.spline(train$SalePrice,train$ScreenPorch,cv=TRUE)
## Warning in smooth.spline(train$SalePrice, train$ScreenPorch, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
## Call:
## smooth.spline(x = train$SalePrice, y = train$ScreenPorch, cv = TRUE)
##
## Smoothing Parameter spar= 0.7890874 lambda= 0.0001765097 (10 iterations)
## Equivalent Degrees of Freedom (Df): 16.62785
## Penalized Criterion (RSS): 1146.012
## PRESS(l.o.o. CV): 1.930691
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.16.1
# GAM with smoothing-spline terms; the df values are hand-rounded from the
# smooth.spline CV output above. NOTE(review): a few choices do not match
# the CV suggestions (e.g. BsmtFinSF2 df=2 vs ~6.4, TotRmsAbvGrd df=2 vs
# ~2.4) -- confirm these were intentional.
sp.gam <- gam(SalePrice~ MSZoning+s(LotArea,df=7)+s(OverallQual,df=7)+s(OverallCond,df=8)+s(YearBuilt,df=13)+s(YearRemodAdd,df=7)+s(BsmtFinSF1,df=16)+s(BsmtFinSF2,df=2)+s(BsmtUnfSF,df=4)+s(X1stFlrSF,df=3)+s(X2ndFlrSF,df=9)+s(LowQualFinSF,df=4)+BsmtFullBath+BsmtHalfBath+FullBath+HalfBath+s(BedroomAbvGr,df=4)+s(KitchenAbvGr,df=15)+s(TotRmsAbvGrd,df=2)+Fireplaces+GarageCars+s(GarageArea,df=6)+s(WoodDeckSF,df=12)+s(ScreenPorch,df=17),data=train)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
# Partial-effect plots with standard-error bands for each fitted term.
plot(sp.gam,se=TRUE,col='blue')
# Training MSE on the model's (log) response scale.
train.gam <- predict(sp.gam,train)
mse.gam <- mean((train.gam-train$SalePrice)^2)
mse.gam
## [1] 0.01229514
# Test predictions, back-transformed with exp(.) - 1 (response presumably
# log1p-transformed upstream -- TODO confirm).
yhat.gam <- predict(sp.gam,newdata = test)
test.gam <- data.frame(Id=test.id,SalePrice=exp(yhat.gam)-1)
head(test.gam,5)
## Id SalePrice
## 1461 1461 122587.6
## 1462 1462 159071.8
## 1463 1463 186595.6
## 1464 1464 204815.7
## 1465 1465 189920.9
write.csv(test.gam,file = 'GAM Model Housing Price.csv',row.names = FALSE)
First, we use all the variables we used before to create an unpruned tree model.
library(tree)
# Unpruned regression tree on all predictors.
sp.tree <- tree(SalePrice~.,data=train)
summary(sp.tree)
##
## Regression tree:
## tree(formula = SalePrice ~ ., data = train)
## Variables actually used in tree construction:
## [1] "OverallQual" "Neighborhood" "GrLivArea"
## Number of terminal nodes: 10
## Residual mean deviance: 0.04059 = 58.86 / 1450
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.133000 -0.108100 0.006889 0.000000 0.116900 0.795200
Tree model training MSE: 0.04059; number of terminal nodes: 10.
# Visualize the fitted tree with split labels.
plot(sp.tree)
text(sp.tree,pretty=0)
# Training MSE of the tree (log-scale response).
train.tree <- predict(sp.tree,train)
mse.tree <- mean((train.tree-train$SalePrice)^2)
mse.tree
## [1] 0.04031398
# Test predictions, back-transformed with exp(.) - 1 (response presumably
# log1p-transformed upstream -- TODO confirm).
sp.tree.pred <- predict(sp.tree,newdata=test)
sp.tree.table <- data.frame(Id=test.id,SalePrice=exp(sp.tree.pred)-1)
head(sp.tree.table,5)
## Id SalePrice
## 1461 1461 129032.3
## 1462 1462 150665.9
## 1463 1463 183795.7
## 1464 1464 183795.7
## 1465 1465 267584.7
write.csv(sp.tree.table,file='Tree Model Housing Price.csv',row.names = FALSE)
# 10-fold cross-validation to choose the tree size for pruning.
set.seed(422)
cv.bal <- cv.tree(sp.tree,K=10)
# Size with the lowest cross-validated deviance.
best.size <- cv.bal$size[which.min(cv.bal$dev)]
best.size
## [1] 10
Since the cross-validation we ran indicates that the best terminal-node size for the tree model is 10, the same as the unpruned tree, we simply choose the unpruned tree as our tree regression method.
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Bagging: a random forest using all predictors at every split.
# NOTE(review): mtry=78 is hard-coded; confirm it equals the number of
# predictor columns in train (e.g. ncol(train) - 1 after any preprocessing).
sp.bag <- randomForest(SalePrice ~.,data=train,mtry=78,importance=TRUE,ntree=1000)
# Variable-importance table (%IncMSE and IncNodePurity).
importance(sp.bag)
## %IncMSE IncNodePurity
## MSSubClass 31.67601564 3.084394e+00
## MSZoning 9.00012614 8.706961e-01
## LotFrontage 6.33728592 7.227263e-01
## LotArea 21.49942366 1.956873e+00
## Street 0.16561912 4.742735e-03
## Alley 3.98995513 1.551027e-01
## LotShape 4.30997733 1.037558e-01
## LandContour -0.62431221 2.135910e-01
## LotConfig 1.07681703 1.441433e-01
## LandSlope 1.12480449 7.026129e-02
## Neighborhood 67.92679121 2.393360e+01
## Condition1 2.14773234 1.849033e-01
## Condition2 0.11759084 1.087263e-02
## BldgType 8.14326005 8.681719e-02
## HouseStyle 5.23765596 1.509648e-01
## OverallQual 109.80356453 1.232858e+02
## OverallCond 26.32059779 1.630250e+00
## YearBuilt 16.27998482 9.648326e-01
## YearRemodAdd 15.94919289 1.122072e+00
## RoofStyle 1.29265881 1.469567e-01
## RoofMatl -0.33707188 9.990567e-03
## Exterior1st 10.74289088 1.122394e+00
## Exterior2nd 12.92418890 1.477688e+00
## MasVnrType 5.25251570 1.027600e-01
## MasVnrArea 8.84340937 4.365222e-01
## ExterQual 9.74880431 2.472902e-01
## ExterCond 4.57986292 4.294897e-01
## Foundation 6.94048303 1.608849e-01
## BsmtQual 8.92136532 5.924880e-01
## BsmtCond 9.71415625 3.325351e-01
## BsmtExposure 9.12934516 3.803916e-01
## BsmtFinType1 21.27698860 1.218419e+00
## BsmtFinSF1 36.10710742 3.568465e+00
## BsmtFinType2 -0.86540529 1.531030e-01
## BsmtFinSF2 1.16343327 5.072025e-02
## BsmtUnfSF 12.86031058 8.364759e-01
## TotalBsmtSF 47.81515257 6.633855e+00
## Heating -2.10018188 4.671189e-02
## HeatingQC 5.08458067 3.614053e-01
## CentralAir 9.49644290 1.675958e+00
## Electrical -2.97826096 1.169865e-01
## X1stFlrSF 27.85335342 4.176168e+00
## X2ndFlrSF 18.45915351 9.548607e-01
## LowQualFinSF -1.73064696 2.739029e-02
## GrLivArea 97.76838153 2.399348e+01
## BsmtFullBath 12.39139506 2.598507e-01
## BsmtHalfBath 0.47808751 1.716792e-02
## FullBath 7.67993928 2.487244e-01
## HalfBath 9.21884664 1.198790e-01
## BedroomAbvGr 9.03889360 2.262668e-01
## KitchenAbvGr 5.49101435 6.499158e-02
## KitchenQual 15.83004927 5.590770e-01
## TotRmsAbvGrd 7.02945838 4.819138e-01
## Functional 1.89288660 2.827865e-01
## Fireplaces 7.60673353 2.169386e-01
## FireplaceQu 11.74247569 7.703773e-01
## GarageType 18.88794021 1.520622e+00
## GarageYrBlt 11.98895474 7.431595e-01
## GarageFinish 14.42021789 5.658529e-01
## GarageCars 20.98181564 6.264109e+00
## GarageArea 23.78222178 4.349662e+00
## GarageQual 9.09588979 3.384324e-01
## GarageCond 0.04553057 2.783718e-01
## PavedDrive 2.06143311 1.534337e-01
## WoodDeckSF 6.30438075 3.916525e-01
## OpenPorchSF 10.73946847 6.742840e-01
## EnclosedPorch 3.85584688 3.077440e-01
## X3SsnPorch 0.24708882 2.171863e-02
## ScreenPorch 4.32185749 8.813215e-02
## PoolArea 1.69518703 1.027626e-02
## PoolQC -1.12050999 8.119013e-03
## Fence -1.54378156 2.195953e-01
## MiscFeature -1.11821662 1.368547e-02
## MiscVal -0.65944586 2.577852e-02
## MoSold 1.22088600 3.363362e+00
## YrSold 0.20176443 5.661495e-01
## SaleType 0.59498049 1.387814e-01
## SaleCondition -2.26924391 5.023724e-01
# Importance dot-plots for the bagged model.
varImpPlot(sp.bag)
From the above summary, we can conclude that OverallQual is the most important predictor of all, since its %IncMSE value is the largest.
# Training MSE of the bagged model (log-scale response).
train.bag <- predict(sp.bag,train)
mse.bag <- mean((train.bag-train$SalePrice)^2)
mse.bag
## [1] 0.003282359
# Test predictions, back-transformed with exp(.) - 1 (response presumably
# log1p-transformed upstream -- TODO confirm).
sp.bag.pred <- predict(sp.bag,newdata=test)
sp.bag.table <- data.frame(Id=test.id,SalePrice=exp(sp.bag.pred)-1)
head(sp.bag.table,5)
## Id SalePrice
## 1461 1461 125441.2
## 1462 1462 155426.5
## 1463 1463 184248.1
## 1464 1464 182322.5
## 1465 1465 192991.7
write.csv(sp.bag.table,file = 'Bagging Method Housing Price.csv',row.names = FALSE)
set.seed(422)
# Random forest sampling round(sqrt(78)) = 9 predictors at each split
# (decorrelates the trees relative to bagging above).
sp.rf <- randomForest(SalePrice~.,data=train,mtry=round(sqrt(78)),importance=TRUE,ntree=1000)
# Variable-importance table (%IncMSE and IncNodePurity).
importance(sp.rf)
## %IncMSE IncNodePurity
## MSSubClass 25.84482350 4.97654145
## MSZoning 12.90308438 1.39427090
## LotFrontage 13.36020269 2.02244805
## LotArea 24.07461188 3.71553730
## Street -1.14816391 0.02675323
## Alley 3.39807288 0.17447754
## LotShape 6.74512237 0.34872657
## LandContour 4.66015037 0.36879936
## LotConfig 0.64077209 0.27122050
## LandSlope 2.96096932 0.20929149
## Neighborhood 33.19627548 21.88434565
## Condition1 5.00183105 0.33181593
## Condition2 -0.21995220 0.04172898
## BldgType 10.76362487 0.38977054
## HouseStyle 14.50711296 0.89750520
## OverallQual 27.20118282 23.46778903
## OverallCond 17.49448212 1.55271957
## YearBuilt 16.67909845 8.50636728
## YearRemodAdd 16.19349570 4.25784547
## RoofStyle 5.16274436 0.45014136
## RoofMatl -0.04126856 0.12942735
## Exterior1st 13.22455265 1.87670095
## Exterior2nd 12.63420918 1.85948965
## MasVnrType 8.64297207 0.44084464
## MasVnrArea 13.09124065 1.43312465
## ExterQual 16.74938610 11.61587272
## ExterCond 0.29241908 0.41529118
## Foundation 8.94845843 2.39261832
## BsmtQual 13.94192419 6.77671953
## BsmtCond 8.94329746 0.45382721
## BsmtExposure 10.61186162 0.79766511
## BsmtFinType1 15.18664669 1.75034938
## BsmtFinSF1 25.24737834 3.74003664
## BsmtFinType2 4.26076186 0.30063702
## BsmtFinSF2 2.53430221 0.16736087
## BsmtUnfSF 12.04060622 1.13223991
## TotalBsmtSF 27.58258919 9.51059839
## Heating -0.90739233 0.17934422
## HeatingQC 9.29688689 0.88920508
## CentralAir 10.38259155 1.43999545
## Electrical -0.36878717 0.23319167
## X1stFlrSF 28.16320346 7.64389466
## X2ndFlrSF 27.71642487 3.50542009
## LowQualFinSF -2.24360250 0.06166294
## GrLivArea 36.80667642 21.60881653
## BsmtFullBath 11.07122820 0.44029561
## BsmtHalfBath 2.96084588 0.06785438
## FullBath 14.82919804 5.74872241
## HalfBath 11.93685927 0.55509351
## BedroomAbvGr 13.03808889 1.15412249
## KitchenAbvGr 6.29987235 0.15217718
## KitchenQual 16.63329840 9.86004616
## TotRmsAbvGrd 19.38180601 3.19921199
## Functional 3.47529593 0.32153561
## Fireplaces 16.94473451 3.02607001
## FireplaceQu 20.12083415 5.08909125
## GarageType 14.78646382 4.62269838
## GarageYrBlt 14.41089256 6.08684378
## GarageFinish 13.46340029 5.75787211
## GarageCars 19.01313159 8.68122707
## GarageArea 21.98127382 8.95913597
## GarageQual 5.76043590 0.94502773
## GarageCond 4.56305817 1.03945544
## PavedDrive 5.56635847 0.43625239
## WoodDeckSF 8.27982595 0.76990639
## OpenPorchSF 13.43964706 1.46184375
## EnclosedPorch 0.68257099 0.29622156
## X3SsnPorch -0.46586811 0.03532769
## ScreenPorch 4.86860341 0.20310578
## PoolArea -1.80100631 0.03932974
## PoolQC -1.57508949 0.03928185
## Fence 1.69497692 0.37383928
## MiscFeature 0.42735380 0.05698580
## MiscVal 0.54841276 0.07306551
## MoSold 0.47324469 2.33289778
## YrSold -0.13197481 0.73029816
## SaleType 2.09530687 0.24001524
## SaleCondition 2.65442366 0.61948718
# Importance dot-plots for the random forest.
varImpPlot(sp.rf)
It seems that GrLivArea is the most important predictor of all.
# Training MSE of the random forest (log-scale response).
train.random <- predict(sp.rf,train)
mse.random <- mean((train.random-train$SalePrice)^2)
mse.random
## [1] 0.003662686
# Test predictions, back-transformed with exp(.) - 1 (response presumably
# log1p-transformed upstream -- TODO confirm).
sp.rf.pred <- predict(sp.rf,newdata=test)
sp.rf.table <- data.frame(Id=test.id,SalePrice=exp(sp.rf.pred)-1)
# BUG FIX: this previously printed sp.bag.table (the bagging results) by
# copy-paste; it now shows the random-forest table being written below.
# The recorded output still reflects the old sp.bag.table call.
head(sp.rf.table,5)
## Id SalePrice
## 1461 1461 125441.2
## 1462 1462 155426.5
## 1463 1463 184248.1
## 1464 1464 182322.5
## 1465 1465 192991.7
write.csv(sp.rf.table,file='RandomForest Housing Price.csv',row.names = FALSE)
library(gbm)
## Loaded gbm 2.1.5
# Gradient boosting: 1000 trees, shrinkage 0.01, depth-10 interactions,
# 10-fold CV (cv.folds) to select the number of trees later.
sp.gbm <- gbm(SalePrice~.,data=train,distribution='gaussian',shrinkage=0.01,n.tree=1000,interaction.depth=10,cv.folds=10)
# Relative-influence table for all predictors.
summary(sp.gbm)
## var rel.inf
## OverallQual OverallQual 33.112984707
## Neighborhood Neighborhood 16.263931578
## GrLivArea GrLivArea 13.751506507
## TotalBsmtSF TotalBsmtSF 3.472304640
## KitchenQual KitchenQual 2.915461002
## ExterQual ExterQual 2.832610552
## X1stFlrSF X1stFlrSF 2.533205709
## GarageCars GarageCars 2.368344382
## BsmtFinSF1 BsmtFinSF1 2.222754203
## GarageArea GarageArea 2.144671998
## MSSubClass MSSubClass 1.884125259
## OverallCond OverallCond 1.421649421
## MoSold MoSold 1.214812103
## LotArea LotArea 1.189171617
## YearRemodAdd YearRemodAdd 1.136942584
## CentralAir CentralAir 1.104814835
## SaleCondition SaleCondition 0.864849303
## FireplaceQu FireplaceQu 0.749052240
## GarageType GarageType 0.622461508
## Exterior1st Exterior1st 0.600101140
## BsmtFinType1 BsmtFinType1 0.520584175
## Exterior2nd Exterior2nd 0.501954032
## YearBuilt YearBuilt 0.470268608
## X2ndFlrSF X2ndFlrSF 0.462972747
## GarageCond GarageCond 0.402289088
## GarageFinish GarageFinish 0.377810787
## BsmtQual BsmtQual 0.333529334
## BsmtExposure BsmtExposure 0.327771896
## GarageYrBlt GarageYrBlt 0.287855940
## MSZoning MSZoning 0.269561427
## Functional Functional 0.248397224
## OpenPorchSF OpenPorchSF 0.244466212
## TotRmsAbvGrd TotRmsAbvGrd 0.244010579
## FullBath FullBath 0.219065074
## Condition1 Condition1 0.201544795
## LotFrontage LotFrontage 0.181715545
## BsmtUnfSF BsmtUnfSF 0.171245718
## WoodDeckSF WoodDeckSF 0.158543730
## GarageQual GarageQual 0.157368687
## Fireplaces Fireplaces 0.149250165
## PavedDrive PavedDrive 0.146599939
## ExterCond ExterCond 0.137079142
## BsmtCond BsmtCond 0.132023354
## YrSold YrSold 0.128732966
## ScreenPorch ScreenPorch 0.120760848
## HeatingQC HeatingQC 0.118627921
## BsmtFullBath BsmtFullBath 0.112982757
## SaleType SaleType 0.097129153
## LandContour LandContour 0.088556044
## LotConfig LotConfig 0.076200250
## MasVnrArea MasVnrArea 0.067747047
## Fence Fence 0.067597036
## EnclosedPorch EnclosedPorch 0.058857470
## BedroomAbvGr BedroomAbvGr 0.043942814
## HalfBath HalfBath 0.042158707
## RoofMatl RoofMatl 0.036849322
## BsmtFinType2 BsmtFinType2 0.033596645
## Foundation Foundation 0.028504070
## Electrical Electrical 0.028106998
## RoofStyle RoofStyle 0.014640387
## BsmtFinSF2 BsmtFinSF2 0.010801418
## KitchenAbvGr KitchenAbvGr 0.010034896
## Alley Alley 0.009943481
## LotShape LotShape 0.009815651
## HouseStyle HouseStyle 0.009354410
## MasVnrType MasVnrType 0.009061307
## X3SsnPorch X3SsnPorch 0.006215119
## LowQualFinSF LowQualFinSF 0.005383488
## LandSlope LandSlope 0.004459384
## BsmtHalfBath BsmtHalfBath 0.002384267
## MiscVal MiscVal 0.002240571
## BldgType BldgType 0.001495228
## Heating Heating 0.001111440
## MiscFeature MiscFeature 0.001045421
## Street Street 0.000000000
## Condition2 Condition2 0.000000000
## PoolArea PoolArea 0.000000000
## PoolQC PoolQC 0.000000000
From the above summary table we can conclude that OverallQual is the most important predictor of all, since its relative influence is 33.1, which is larger than that of any other predictor.
# Training MSE. NOTE(review): no n.trees is supplied here, so predict.gbm
# falls back to its own choice ("Using 880 trees..."), while the test
# prediction below uses which.min(sp.gbm$cv.error) -- consider using the
# same n.trees in both for consistency.
train.gbm <- predict(sp.gbm,train)
## Using 880 trees...
mse.gbm <- mean((train.gbm-train$SalePrice)^2)
mse.gbm
## [1] 0.004839912
# Test predictions at the CV-optimal tree count, back-transformed with
# exp(.) - 1 (response presumably log1p-transformed upstream).
sp.gbm.yhat <- predict(sp.gbm,newdata=test,n.trees=which.min(sp.gbm$cv.error))
sp.gbm.table <- data.frame(Id=test.id,SalePrice=exp(sp.gbm.yhat)-1)
head(sp.gbm.table,5)
## Id SalePrice
## 1 1461 122068.0
## 2 1462 159122.3
## 3 1463 183513.2
## 4 1464 186705.9
## 5 1465 193232.6
write.csv(sp.gbm.table,file='Boosting Housing Price.csv',row.names = FALSE)
set.seed(123)
# 10-fold CV over k for KNN. NOTE(review): class::knn is a CLASSIFIER; used
# on the continuous SalePrice it treats every distinct price as a class, so
# the "error rate" printed below (~0.99) is a misclassification rate, not a
# regression error. A regression KNN (e.g. FNN::knn.reg) would be the usual
# choice here -- confirm this was intentional.
x <- model.matrix(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+
YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+ X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+
HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+ GarageCars+GarageArea+WoodDeckSF+ScreenPorch, train)[, -1]
# NOTE(review): assumes SalePrice is the LAST column of train -- fragile;
# train$SalePrice would be safer.
y <- train[, length(train)]
# Randomly assign each row to one of 10 folds.
fold.index <- cut(sample(1:nrow(train)), breaks=10, labels=FALSE)
k.value <- c(1,50,100)
error.k <- rep(0, length(k.value))
counter <- 0
for(k in k.value){
counter <- counter + 1
error <- 0
for(i in 1:10){
# Fit on the 9 held-in folds, predict the held-out fold.
pred.knn <- knn(x[fold.index!=i,], x[fold.index==i,], y[fold.index!=i], k=k)
# Count exact mismatches (classification-style error).
error <- error + sum(pred.knn != y[fold.index==i])
}
error.k[counter] <- error/nrow(train)
}
print(error.k)
## [1] 0.9958904 0.9958904 0.9904110
set.seed(123)
# Final KNN prediction on the test set with k = 10.
x.train <- model.matrix(SalePrice ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+
YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+ X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+
HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+ GarageCars+GarageArea+WoodDeckSF+ScreenPorch,train)[,-1]
x.test <- model.matrix( ~ MSZoning+LotArea+OverallQual+OverallCond+YearBuilt+ YearRemodAdd+BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+X1stFlrSF+
X2ndFlrSF+LowQualFinSF+BsmtFullBath+BsmtHalfBath+FullBath+ HalfBath+BedroomAbvGr+KitchenAbvGr+TotRmsAbvGrd+Fireplaces+ GarageCars+GarageArea+WoodDeckSF+ScreenPorch,
test)[,-1]
# NOTE(review): class::knn is a classifier -- it majority-votes over factor
# labels (the price strings) rather than averaging neighbours; confirm a
# regression KNN was not intended.
pred.knn <- knn(x.train, x.test, train$SalePrice, k=10)
# Factor labels -> numeric, then back-transform from the log1p scale.
pred.knn <- exp(as.numeric(as.character(pred.knn)))-1
test.knn <- data.frame(Id = test.id, SalePrice = pred.knn)
head(test.knn, 5)
## Id SalePrice
## 1 1461 302000
## 2 1462 97000
## 3 1463 172500
## 4 1464 192000
## 5 1465 173000
write.csv(test.knn,file='KNN Method Housing Price.csv',row.names = FALSE)
After all the analysis we have done, we can compare the training MSE of each model to predict which might be the best model for this dataset.
# Training-MSE comparison across all fitted models (mse.lm, mse.best,
# mse.fwd, mse.bwd and mse.ridge are defined earlier in the document).
mse.lm
## [1] 0.01959431
mse.best
## [1] 0.01955967
mse.fwd
## [1] 0.01955967
mse.bwd
## [1] 0.01955967
mse.lasso
## [1] 0.01959865
mse.ridge
## [1] 0.01770622
mse.gam
## [1] 0.01229514
mse.tree
## [1] 0.04031398
mse.bag
## [1] 0.003282359
mse.random
## [1] 0.003662686
mse.gbm
## [1] 0.004839912
# NOTE(review): 1 - error.k is a classification ACCURACY from the KNN CV,
# not an MSE -- it is not directly comparable to the values above.
1-error.k
## [1] 0.004109589 0.004109589 0.009589041
From the above data we can conclude that, based on training MSE, we might want to use the bagging method on the testing dataset for the best SalePrice prediction. However, we should also compare the testing MSE (true error) from the testing dataset to see which method actually performs best.
From the true error we can clearly see that the boosting method is the best method for this dataset, because it has the lowest true error and a relatively low training error among all methods.